import sklearn.metrics
from sklearn.datasets import fetch_20newsgroups
Since there already exists a nice example for explaining multi-class text classifiers with Lime, I will use it as the basis for my example.
This example is based on the famous 20 newsgroups dataset.
# Load the 20 newsgroups corpus: the full set plus the official train/test split.
dataset_complete = fetch_20newsgroups()
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')


def _shorten_group_name(full_name):
    """Keep the last dot-separated part of a newsgroup name (last two parts for *.misc groups)."""
    parts = full_name.split('.')
    if 'misc' in full_name:
        return '.'.join(parts[-2:])
    return parts[-1]


# Shorter, human-friendly class labels for printouts and plots.
class_names = [_shorten_group_name(name) for name in newsgroups_train.target_names]
# Disambiguate the two *.hardware groups by hand.
class_names[3] = 'pc.hardware'
class_names[4] = 'mac.hardware'
I use the same classifier as the Lime example: a TF-IDF vectorizer prepares the features, and a Multinomial Naive Bayes model is trained for classification.
# Vectorize the raw text with TF-IDF (case-sensitive, matching the Lime example).
from sklearn.naive_bayes import MultinomialNB

vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(lowercase=False)
train_vectors = vectorizer.fit_transform(newsgroups_train.data)
test_vectors = vectorizer.transform(newsgroups_test.data)

# Train a Multinomial Naive Bayes classifier on the training split.
nb = MultinomialNB(alpha=.01)
nb.fit(train_vectors, newsgroups_train.target)

# Weighted F1 score on the held-out test split (displayed as the cell output).
pred = nb.predict(test_vectors)
sklearn.metrics.f1_score(newsgroups_test.target, pred, average='weighted')
As shown, the classifier achieves a suitable F-score. However, this classifier is known to be susceptible to overfitting on this dataset (https://scikit-learn.org/stable/datasets/#filtering-text-for-more-realistic-training).
Next, Lime is used to explain individual predictions. To do so, an explainer instance has to be created and a document within our test dataset has to be chosen.
# Wrap vectorizer + classifier into a single pipeline so Lime can call
# predict_proba directly on raw text.
from lime import lime_text
from lime.lime_text import LimeTextExplainer
from sklearn.pipeline import make_pipeline

c = make_pipeline(vectorizer, nb)
explainer = LimeTextExplainer(class_names=class_names)

# Pick one test document and explain its classification for labels 0 and 17.
idx = 1340
exp = explainer.explain_instance(newsgroups_test.data[idx], c.predict_proba,
                                 num_features=6, labels=[0, 17])

print('Document id: %d' % idx)
print('Predicted class =', class_names[nb.predict(test_vectors[idx])[0]])
print('True class: %s' % class_names[newsgroups_test.target[idx]])
Now, the explanation can be looked at in detail:
# Print the weighted word features for each explained label,
# separated by a single blank line.
for position, label in enumerate((0, 17)):
    if position:
        print()
    print('Explanation for class %s' % class_names[label])
    print('\n'.join(str(pair) for pair in exp.as_list(label=label)))
And we can visualize the explanation to be more user friendly. Here we only visualize the explanation for the top class.
exp.show_in_notebook(text=False, labels=exp.available_labels()[:1])
To demonstrate the new version of the Lime explainer based on topics, the same classifier and dataset will be used.
A well-known approach to topic modelling is Latent Dirichlet Allocation (LDA). To generate a good topic model, the text corpus on which the LDA is trained should be preprocessed. Since LDA is a statistical approach that looks at the appearance of words within documents, the words should be transformed into their lemmatized form.
The new Lime text explainer includes a simple class that can be used for preprocessing a text corpus.
from src.lime_text_topics import PreProcessor

# Text pre-processing helper shipped with the topic-based Lime explainer:
# removes unnecessary characters and lemmatizes tokens (its lemmatize()
# method is used by the word_to_topics mappings below).
data_processor = PreProcessor()
To create topics using LDA, the implementation within gensim is used. Here we create 20 topics using the complete 20 newsgroups dataset.
Since creating an LDA takes some time, a previously trained LDA is loaded.
import gensim
from gensim.test.utils import datapath
from gensim.models import LdaModel
from gensim.corpora import Dictionary

# Resolve the path to a previously trained 20-topic LDA model and load it
# (training one from scratch takes considerable time).
temp_file = datapath("secretLocation/src/examples/models/20_newsgroup/lda_20newsgroup_20topics.model")
lda_model = LdaModel.load(temp_file)

# Human-readable name for each LDA topic.
topics = ['topic #' + str(topic_id) for topic_id in range(lda_model.num_topics)]

# ... and load the token<->id dictionary that belongs to the LDA model.
id2word = Dictionary.load_from_text('secretLocation/src/examples/models/20_newsgroup/20_newsgroup_dict')
We can look at the topics created by the LDA:
# Inspect the loaded topic model.
print(f'Number of topics: {lda_model.num_topics}')
# Show the top words of a single topic.
lda_model.print_topics(1)
Unfortunately, LDA considers all words to be present in all topics. Moreover, the model was built on a lemmatized version of the dataset. Therefore, a function that maps words to words within the LDA must be created and passed on to the new Lime approach. It maps a word to its lemmatized form and returns a list of all topics in which the word is present with a probability higher than a defined minimum probability.
# Minimum per-topic probability a word must reach to be reported.
minimum_probability = 0.002


def word_to_topics(word):
    """
    Map a word onto the ids of the LDA topics it belongs to.

    :param word: the word that is searched for
    :return: list of topic ids; empty if the word cannot be lemmatized
             or is unknown to the LDA dictionary
    """
    # Lemmatize first: the LDA was built on a lemmatized corpus.
    lemmas = data_processor.lemmatize(word)
    if not lemmas:
        return []
    token = lemmas[0]
    # Words missing from the dictionary belong to no topic.
    if token not in id2word.token2id:
        return []
    token_id = id2word.token2id[token]
    term_topics = lda_model.get_term_topics(token_id, minimum_probability=minimum_probability)
    return [topic_id for topic_id, _ in term_topics]
Now we can use this function to explain the classifier with the newly generated topics.
from src.lime_text_topics import LimeTextByTopicsExplainer

# Explain the same document again, this time at topic granularity.
explainer_mod = LimeTextByTopicsExplainer(class_names=class_names,
                                          word_to_topics=word_to_topics,
                                          topics=topics)
exp_mod = explainer_mod.explain_instance(newsgroups_test.data[idx], c.predict_proba,
                                         num_features=6, top_labels=2)

print('Document id: %d' % idx)
print('Predicted class =', class_names[nb.predict(test_vectors[idx])[0]])
print('True class: %s\n' % class_names[newsgroups_test.target[idx]])

# Print the weighted topic features for each of the top labels.
for label in exp_mod.available_labels():
    print('Explanation for class %s' % class_names[label])
    print('\n'.join(str(pair) for pair in exp_mod.as_list(label=label)))
    print()
And again we can visualize the outcome for the user:
exp_mod.show_in_notebook(text=False, labels=exp_mod.available_labels())
We can also create a fake topic that includes all words that are not included in the LDA.
# Same topic-based explanation, but with a fake topic collecting all
# words that could not be mapped onto any LDA topic.
explainer_mod_all_words = LimeTextByTopicsExplainer(class_names=class_names,
                                                    word_to_topics=word_to_topics,
                                                    topics=topics,
                                                    consider_all_words=True)
exp_mod_all_words = explainer_mod_all_words.explain_instance(newsgroups_test.data[idx], c.predict_proba,
                                                             num_features=6, top_labels=2)

print('Document id: %d' % idx)
print('Predicted class =', class_names[nb.predict(test_vectors[idx])[0]])
print('True class: %s\n' % class_names[newsgroups_test.target[idx]])

for label in exp_mod_all_words.available_labels():
    print('Explanation for class %s' % class_names[label])
    print('\n'.join(str(pair) for pair in exp_mod_all_words.as_list(label=label)))
    print()

exp_mod_all_words.show_in_notebook(text=False, labels=exp_mod_all_words.available_labels())
A useful alternative to training an LDA on the same dataset might be training an LDA on another dataset.
Since we want to represent as many words as possible in our LDA, a useful dataset needs to be composed of a huge number of words. Therefore, I will now demonstrate how the complete Wikipedia can be used to create topics and explain a classifier's classification.
We load a previously trained LDA that was created using the complete Wikipedia. A detailed instruction on how to create an LDA using the Wikipedia dump can be found here.
Again, a previously created LDA is loaded.
from gensim.test.utils import datapath
from gensim.models import LdaModel

# Resolve the path to a previously trained 100-topic Wikipedia LDA and load it.
# NOTE(review): path differs from the other models ("secretLocatio", double
# slash) — confirm it is correct.
temp_file_wiki = datapath("secretLocatio/src/examples//models/wiki/lda_wiki_100topics.model")
lda_model_wiki = LdaModel.load(temp_file_wiki)

# Human-readable name for each Wikipedia topic.
topics_wiki = ['topic #' + str(topic_id) for topic_id in range(lda_model_wiki.num_topics)]
To use this LDA we have to rewrite our word-to-topic mapping. Again this is done in the same way as before, with an adjusted minimum probability. However, since we load an LDA model, we also need the dictionary that was used to create the model.
# Load the id->word mapping (the dictionary used to build the Wikipedia LDA).
dictionary = gensim.corpora.Dictionary.load_from_text('secretLocation/src/examples//models/wiki/_wordids.txt.bz2')

# Lower threshold than before — presumably because the Wikipedia model spreads
# probability mass over 100 topics; confirm against the model.
minimum_probability_wiki = 0.0004


def word_to_topics_wiki(word):
    """
    Map a word onto the ids of the Wikipedia LDA topics it belongs to.

    :param word: the word that is searched for
    :return: list of topic ids; empty if the word cannot be lemmatized
             or is unknown to the Wikipedia dictionary
    """
    # Use the same lemmatization that was used when building the LDA.
    lemmas = data_processor.lemmatize(word)
    if not lemmas:
        return []
    token = lemmas[0]
    if token not in dictionary.token2id:
        return []
    token_id = dictionary.token2id[token]
    candidates = lda_model_wiki.get_term_topics(token_id, minimum_probability=minimum_probability_wiki)
    return [topic_id for topic_id, _ in candidates]
Create an explanation:
# Explain the document using the Wikipedia-based topics.
explainer_wiki = LimeTextByTopicsExplainer(class_names=class_names,
                                           word_to_topics=word_to_topics_wiki,
                                           topics=topics_wiki)
exp_wiki = explainer_wiki.explain_instance(newsgroups_test.data[idx], c.predict_proba,
                                           num_features=6, top_labels=2)

print('Document id: %d' % idx)
print('Predicted class =', class_names[nb.predict(test_vectors[idx])[0]])
print('True class: %s\n' % class_names[newsgroups_test.target[idx]])

for label in exp_wiki.available_labels():
    print('Explanation for class %s' % class_names[label])
    print('\n'.join(str(pair) for pair in exp_wiki.as_list(label=label)))
    print()

exp_wiki.show_in_notebook(text=False, labels=exp_wiki.available_labels())
Or create an explanation with the fake topic containing all words that could not be mapped:
# Same Wikipedia-topic explanation, but with a fake topic collecting all
# words that could not be mapped onto any LDA topic.
explainer_wiki_all = LimeTextByTopicsExplainer(class_names=class_names,
                                               word_to_topics=word_to_topics_wiki,
                                               topics=topics_wiki,
                                               consider_all_words=True)
exp_wiki_all = explainer_wiki_all.explain_instance(newsgroups_test.data[idx], c.predict_proba,
                                                   num_features=6, top_labels=2)

print('Document id: %d' % idx)
print('Predicted class =', class_names[nb.predict(test_vectors[idx])[0]])
print('True class: %s\n' % class_names[newsgroups_test.target[idx]])

for x in exp_wiki_all.available_labels():
    print('Explanation for class %s' % class_names[x])
    print('\n'.join(map(str, exp_wiki_all.as_list(label=x))))
    print()

# BUG FIX: was labels=exp_wiki.available_labels(), i.e. the labels of the
# PREVIOUS (non-all-words) explanation — use this explanation's own labels.
exp_wiki_all.show_in_notebook(text=False, labels=exp_wiki_all.available_labels())